{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "1000\n",
      "1000\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "trainset.data = trainset.data[:1000]\n",
    "trainset.target = trainset.target[:1000]\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ONEHOT = np.zeros((len(trainset.data),len(trainset.target_names)))\n",
    "ONEHOT[np.arange(len(trainset.data)),trainset.target] = 1.0\n",
    "train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data, \n",
    "                                                                               trainset.target, \n",
    "                                                                               ONEHOT, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 4999\n",
      "Most common words [('the', 899), ('a', 691), ('and', 655), ('of', 587), ('to', 366), ('is', 323)]\n",
      "Sample data [4, 948, 9, 835, 8, 22, 4, 3076, 2172, 91] ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'centurys', 'new']\n"
     ]
    }
   ],
   "source": [
    "concat = ' '.join(trainset.data).split()\n",
    "vocabulary_size = len(list(set(concat)))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d'%(vocabulary_size))\n",
    "print('Most common words', count[4:10])\n",
    "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "GO = dictionary['GO']\n",
    "PAD = dictionary['PAD']\n",
    "EOS = dictionary['EOS']\n",
    "UNK = dictionary['UNK']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self, size_layer, num_layers, embedded_size, batch_size,\n",
    "                 from_dict_size, to_dict_size, grad_clip=5.0):\n",
    "        self.size_layer = size_layer\n",
    "        self.num_layers = num_layers\n",
    "        self.embedded_size = embedded_size\n",
    "        self.grad_clip = grad_clip\n",
    "        self.from_dict_size = from_dict_size\n",
    "        self.to_dict_size = to_dict_size\n",
    "        self.batch_size = batch_size\n",
    "        self.model = tf.estimator.Estimator(self.model_fn)\n",
    "        \n",
    "    def lstm_cell(self, reuse=False):\n",
    "        return tf.nn.rnn_cell.LSTMCell(self.size_layer, reuse=reuse)\n",
    "    \n",
    "    def seq2seq(self, x_dict):\n",
    "        x = x_dict['x']\n",
    "        x_seq_len = x_dict['x_len']\n",
    "        with tf.variable_scope('encoder'):\n",
    "            encoder_embedding = tf.get_variable('encoder_embedding', \n",
    "                                                [self.from_dict_size, self.embedded_size], \n",
    "                                                tf.float32, tf.random_uniform_initializer(-1.0, 1.0))\n",
    "            _, encoder_state = tf.nn.dynamic_rnn(\n",
    "                cell = tf.nn.rnn_cell.MultiRNNCell([self.lstm_cell() for _ in range(self.num_layers)]), \n",
    "                inputs = tf.nn.embedding_lookup(encoder_embedding, x),\n",
    "                sequence_length = x_seq_len,\n",
    "                dtype = tf.float32)\n",
    "            encoder_state = tuple(encoder_state[-1] for _ in range(self.num_layers))\n",
    "        y = x_dict['y']\n",
    "        y_seq_len = x_dict['y_len']\n",
    "        with tf.variable_scope('decoder'):\n",
    "            decoder_embedding = tf.get_variable(\n",
    "                'decoder_embedding', [self.to_dict_size, self.embedded_size], tf.float32,\n",
    "                tf.random_uniform_initializer(-1.0, 1.0))\n",
    "            helper = tf.contrib.seq2seq.TrainingHelper(\n",
    "                inputs = tf.nn.embedding_lookup(decoder_embedding, y),\n",
    "                sequence_length = y_seq_len,\n",
    "                time_major = False)\n",
    "            decoder = tf.contrib.seq2seq.BasicDecoder(\n",
    "                cell = tf.nn.rnn_cell.MultiRNNCell([self.lstm_cell() for _ in range(self.num_layers)]),\n",
    "                helper = helper,\n",
    "                initial_state = encoder_state,\n",
    "                output_layer = tf.layers.Dense(len(trainset.target_names)))\n",
    "            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(\n",
    "                decoder = decoder,\n",
    "                impute_finished = True,\n",
    "                maximum_iterations = tf.reduce_max(y_seq_len))\n",
    "            return decoder_output.rnn_output[:,-1]\n",
    "            \n",
    "    def model_fn(self, features, labels, mode):\n",
    "        logits = self.seq2seq(features)\n",
    "        if mode == tf.estimator.ModeKeys.PREDICT:\n",
    "            return tf.estimator.EstimatorSpec(mode, predictions=logits)\n",
    "        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels))\n",
    "        params = tf.trainable_variables()\n",
    "        gradients = tf.gradients(cost, params)\n",
    "        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.grad_clip)\n",
    "        train_op = tf.train.AdamOptimizer().apply_gradients(zip(clipped_gradients, params),\n",
    "                                                            global_step=tf.train.get_global_step())\n",
    "        acc_op = tf.metrics.accuracy(labels=tf.argmax(labels,1), predictions=tf.argmax(logits,1))\n",
    "        estim_specs = tf.estimator.EstimatorSpec(\n",
    "            mode = mode,\n",
    "            predictions = tf.argmax(logits,1),\n",
    "            loss = cost,\n",
    "            train_op = train_op,\n",
    "            eval_metric_ops = {'accuracy': acc_op})\n",
    "        return estim_specs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Using default config.\n",
      "WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpcpjzbyns\n",
      "INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_is_chief': True, '_tf_random_seed': None, '_save_summary_steps': 100, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_master': '', '_num_worker_replicas': 1, '_session_config': None, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7feb9a120160>, '_task_id': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_model_dir': '/tmp/tmpcpjzbyns'}\n"
     ]
    }
   ],
   "source": [
    "size_layer = 256\n",
    "num_layers = 2\n",
    "embedded_size = 256\n",
    "batch_size = len(train_X)\n",
    "maxlen = 50\n",
    "skip = 5\n",
    "model = Model(size_layer, num_layers, embedded_size, batch_size,\n",
    "                vocabulary_size + 4, vocabulary_size + 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_x = str_idx(train_X,dictionary,maxlen).astype(np.int32)\n",
    "batch_y = batch_x[:,skip:]\n",
    "seq_x = np.array([maxlen] * len(train_X)).astype(np.int32)\n",
    "seq_y = np.array([maxlen-skip] * len(train_X)).astype(np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Create CheckpointSaverHook.\n",
      "INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpcpjzbyns/model.ckpt.\n",
      "INFO:tensorflow:step = 1, loss = 0.6710061\n",
      "INFO:tensorflow:Saving checkpoints for 10 into /tmp/tmpcpjzbyns/model.ckpt.\n",
      "INFO:tensorflow:Loss for final step: 1.20619625e-05.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.estimator.estimator.Estimator at 0x7feb9a172fd0>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_fn = tf.estimator.inputs.numpy_input_fn(\n",
    "            x={'x':batch_x, 'x_len':seq_x, 'y':batch_y, 'y_len':seq_y}, y=train_onehot,\n",
    "            batch_size=batch_size, num_epochs=10, shuffle=False)\n",
    "model.model.train(input_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
